library(tidyverse)
## -- Attaching packages ------------------------------------------------------------------------------ tidyverse 1.2.1 --
## v ggplot2 3.1.0 v purrr 0.2.5
## v tibble 1.4.2 v dplyr 0.7.8
## v tidyr 0.8.2 v stringr 1.3.1
## v readr 1.2.1 v forcats 0.3.0
## -- Conflicts --------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
library(ggrepel)
library(broom)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Use a light ggplot2 theme for every plot in this analysis.
theme_set(theme_light())
# TidyTuesday 2018-10-16 "College Majors" dataset: earnings and
# demographics of recent graduates, one row per major. All columns are
# numeric except Major and Major_category (see recorded spec below).
recent_grads <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2018-10-16/recent-grads.csv")
## Parsed with column specification:
## cols(
## .default = col_double(),
## Major = col_character(),
## Major_category = col_character()
## )
## See spec(...) for full column specifications.
# Rows sorted highest-paid first; Major is title-cased and turned into
# a factor ordered by median salary so downstream plots sort themselves.
majors_processed <- recent_grads %>%
  arrange(desc(Median)) %>%
  mutate(Major = fct_reorder(str_to_title(Major), Median))
# Roll majors up to Major_category totals. Food Science has an NA Total,
# so incomplete rows are dropped before summing. The salary columns are
# weighted by survey Sample_size; note they are weighted means of
# *medians*, so interpret with care.
categories_processed <- majors_processed %>%
  filter(!is.na(Total)) %>% #Food science has an NA here!
  group_by(Major_category) %>%
  summarize(Total = sum(Total),
            Men = sum(Men),
            Women = sum(Women),
            ShareWomen = Women / Total,
            Samples = sum(Sample_size),
            # weighted.mean(x, w) == sum(x * w) / sum(w): use the base
            # helper instead of hand-rolling the formula three times.
            Median_wtd = weighted.mean(Median, Sample_size),
            P25th_wtd = weighted.mean(P25th, Sample_size),
            P75th_wtd = weighted.mean(P75th, Sample_size))
Keep in mind the limitations of what we are looking at here: survey sample sizes vary enormously across majors.
# Distribution of the survey sample size per major: heavily right-skewed
# (median 130, max 4212 per the recorded output), so unweighted
# comparisons across majors are shaky.
Hmisc::describe(majors_processed$Sample_size)
## majors_processed$Sample_size
## n missing distinct Info Mean Gmd .05 .10
## 173 0 147 1 356.1 491 7.0 14.4
## .25 .50 .75 .90 .95
## 39.0 130.0 338.0 1028.0 1668.6
##
## lowest : 2 3 4 5 7, highest: 2394 2554 2584 2684 4212
# Stacked horizontal bars: graduates per major category, split by
# gender, ordered by total enrollment.
categories_processed %>%
  mutate(Major_category = fct_reorder(Major_category, Total)) %>%
  gather(Gender, Number, Men, Women) %>%
  ggplot(aes(Major_category, Number)) +
  geom_col(aes(fill = Gender)) +
  coord_flip() +
  labs(x = "", y = "")
# The 25 largest majors by enrollment, split by gender and ordered by
# total number of graduates.
majors_processed %>%
  arrange(desc(Total)) %>%
  head(25) %>%
  mutate(Major = fct_reorder(Major, Total)) %>%
  gather(Gender, Number, Men, Women) %>%
  ggplot(aes(Major, Number)) +
  geom_col(aes(fill = Gender)) +
  scale_y_continuous(labels = comma_format()) +
  coord_flip() +
  labs(x = "", y = "")
Only taking majors above the 10th percentile of sample size (15).
# Salary spread within each major category, categories ordered by
# median salary. Legend suppressed: the axis already names each box.
majors_processed %>%
  filter(Sample_size > 15) %>%
  mutate(Major_category = fct_reorder(Major_category, Median)) %>%
  ggplot(aes(Major_category, Median)) +
  geom_boxplot(aes(fill = Major_category)) +
  coord_flip() +
  scale_y_continuous(labels = dollar_format()) +
  expand_limits(y = 0) +
  theme(legend.position = "none")
Plotting the top 20 median earners among majors at or above the 50th percentile of sample size (130). The plot shows the median (point) with the interquartile range (25th–75th percentiles).
# Top 20 median earners among majors with Sample_size >= 130 (data is
# already sorted by descending Median, so head(20) is the top 20).
# Point = median salary; error bar = 25th-75th percentile range.
majors_processed %>%
  filter(Sample_size >= 130) %>%
  head(20) %>%
  ggplot(aes(Major, Median, color = Major_category)) +
  geom_point() +
  geom_errorbar(aes(ymin = P25th, ymax = P75th)) +
  scale_y_continuous(labels = dollar_format()) +
  coord_flip() +
  # c(0, 80000), not 0:80000 -- the latter builds an 80,001-element
  # integer vector just to produce the same axis limits
  expand_limits(y = c(0, 80000))
Plotting the bottom 20 median earners among majors at or above the 50th percentile of sample size (130). The plot shows the median (point) with the interquartile range (25th–75th percentiles).
# Bottom 20 median earners among majors with Sample_size >= 130 (data
# is sorted by descending Median, so tail(20) is the bottom 20).
# Point = median salary; error bar = 25th-75th percentile range.
majors_processed %>%
  filter(Sample_size >= 130) %>%
  tail(20) %>%
  ggplot(aes(Major, Median, color = Major_category)) +
  geom_point() +
  geom_errorbar(aes(ymin = P25th, ymax = P75th)) +
  scale_y_continuous(labels = dollar_format()) +
  coord_flip() +
  # c(0, 80000), not 0:80000 -- the latter builds an 80,001-element
  # integer vector just to produce the same axis limits
  expand_limits(y = c(0, 80000))
I took 750 as the Sample_size cutoff because the graph was pretty unreadable otherwise; the majors above this point also seemed more familiar. The dot is the median, and the bars span the 25th–75th percentiles.
# Median salary with IQR bars for the very largest majors only
# (Sample_size >= 750), colored by category.
majors_processed %>%
  filter(Sample_size >= 750) %>%
  ggplot(aes(Major, Median, color = Major_category)) +
  geom_point() +
  geom_errorbar(aes(ymin = P25th, ymax = P75th)) +
  coord_flip() +
  scale_y_continuous(labels = dollar) +
  expand_limits(y = 0) +
  labs(x = "")
# Interactive scatter: median salary vs. share of women per major.
# Categories are lumped into the 4 largest plus "Other" (moved to the
# front of the level order); point size encodes survey sample size and
# the Major label feeds the plotly tooltip.
g <- majors_processed %>%
  filter(!is.na(Total)) %>%
  mutate(Major_category = fct_relevel(fct_lump(Major_category, 4), "Other", after = 0)) %>%
  ggplot(aes(ShareWomen, Median, color = Major_category, size = Sample_size, label = Major)) +
  geom_point() +
  geom_smooth(aes(group = 1), method = "lm") +
  scale_color_brewer(palette = "Dark2") +
  scale_y_continuous(labels = dollar_format()) +
  expand_limits(y = 0)
ggplotly(g)
50th percentile of sample size as cutoff (130+).
# Same interactive scatter as above, restricted to majors at or above
# the 50th percentile of sample size (130) to cut down on noise.
g <- majors_processed %>%
  filter(!is.na(Total) & Sample_size >= 130) %>%
  mutate(Major_category = fct_relevel(fct_lump(Major_category, 4), "Other", after = 0)) %>%
  ggplot(aes(ShareWomen, Median, color = Major_category, size = Sample_size, label = Major)) +
  geom_point() +
  geom_smooth(aes(group = 1), method = "lm") +
  scale_color_brewer(palette = "Dark2") +
  scale_y_continuous(labels = dollar_format()) +
  expand_limits(y = 0)
ggplotly(g)
Weighted by sample size, so nothing is filtered out here. Note that weighted linear regression expects MEANs, not MEDIANs, so this is borked from the start.
# Weighted least squares: regress median salary on share of women,
# weighting each major by its survey sample size. Per the recorded
# summary: slope ~ -$23,650 across the full 0%-100% ShareWomen range,
# with one row (Food Science, NA Total) dropped due to missingness.
# NOTE(review): the response is a *median*, so WLS assumptions only
# hold approximately -- see the caveat in the surrounding text.
majors_processed %>%
lm(Median ~ ShareWomen, data = ., weights = Sample_size) %>%
summary()
##
## Call:
## lm(formula = Median ~ ShareWomen, data = ., weights = Sample_size)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -260500 -61042 -13899 33262 865081
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 52073 1436 36.255 <2e-16 ***
## ShareWomen -23650 2403 -9.842 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 123000 on 170 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.363, Adjusted R-squared: 0.3592
## F-statistic: 96.87 on 1 and 170 DF, p-value: < 2.2e-16
# Same weighted regression restricted to well-sampled majors
# (Sample_size >= 130). Per the recorded summary the slope is nearly
# unchanged (~ -$23,606), suggesting the full-data estimate was not
# driven by tiny-sample majors.
majors_processed %>%
filter(Sample_size >= 130 & !is.na(Total)) %>%
lm(Median ~ ShareWomen, data = ., weights = Sample_size) %>%
summary()
##
## Call:
## lm(formula = Median ~ ShareWomen, data = ., weights = Sample_size)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -260877 -88878 -27618 32049 863618
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 52063 1976 26.347 < 2e-16 ***
## ShareWomen -23606 3299 -7.156 2.74e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 160900 on 85 degrees of freedom
## Multiple R-squared: 0.3759, Adjusted R-squared: 0.3686
## F-statistic: 51.21 on 1 and 85 DF, p-value: 2.739e-10
Seems like women tend to receive degrees in lower-earning majors :thinking: Bottom line: if a major went from 100% men to 100% women, you could expect the median expected salary to go down by $23,650. In other words, every percentage point of men in a major is associated with a ~$237 increase in median expected salary.
Looking at the categories and share of women. Consider comparing logarithmic vs. polynomial vs. linear fits. Remember that this median wage is WEIGHTED by the sample size, and the linear regression is then further weighted by sample size.
# Category-level view: sample-size-weighted median salary vs. share of
# women, with a linear fit and repelled text labels so category names
# don't overlap.
categories_processed %>%
  ggplot(aes(ShareWomen, Median_wtd)) +
  geom_point() +
  geom_smooth(method = "lm") +
  geom_text_repel(aes(label = Major_category), force = 8)
# Category-level weighted regression of the (already sample-size
# weighted) median wage on share of women, further weighted by total
# samples per category. Per the recorded summary: slope ~ -$28,804 on
# only 16 categories (14 residual df), so the estimate is coarse.
categories_processed %>%
lm(Median_wtd ~ ShareWomen, data = ., weights = Samples) %>%
summary()
##
## Call:
## lm(formula = Median_wtd ~ ShareWomen, data = ., weights = Samples)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -366854 -204031 -70545 27914 800848
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 55002 4453 12.352 6.45e-09 ***
## ShareWomen -28804 7597 -3.791 0.00198 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 311600 on 14 degrees of freedom
## Multiple R-squared: 0.5066, Adjusted R-squared: 0.4714
## F-statistic: 14.37 on 1 and 14 DF, p-value: 0.001984